I wrote the following code snippets while reading the book "Think Stats" by Allen B. Downey which aim is given by its subtitle "Probability and Statistics for Programmers". Initially this notebook served for me as a playground to reproduce various topics of the book. Now it's a quick reminder of statistical concepts and syntax of tools for it.
(c) Florian Hoppe 2013
In [3]:
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import matplotlib.pyplot as plt
import numpy.random as rand
from datetime import datetime, timedelta
import dateutil
from scipy.stats import pareto
import scipy.stats
import random as rnd
In [4]:
%pylab inline
In [5]:
a, m = 1.7, 150.
s = Series(np.random.pareto(a, 10000000) + m)
print("%f < x < %f" % (min(s),max(s)))
In [6]:
s.describe()
Out[6]:
In [7]:
count, bins, ignored = plt.hist(s, 100, normed=True)
fit = a*m**a/bins**(a+1)
plt.xlim([0,1000])
plt.plot(bins, max(count)*fit/max(fit),linewidth=2, color='r')
plt.show()
In [8]:
rv = pareto([.5,] * scipy.stats.pareto.numargs)
In [51]:
s_w = Series(np.random.weibull(2.5,100000))
s_n = Series(np.random.randn(100000))
In [52]:
hist_w = np.histogram(s_w,bins=100)
hist_n = np.histogram(s_n,bins=100)
In [53]:
plot(hist_w[0],'r',hist_n[0],'b')
Out[53]:
In [56]:
# Pearson skewness is better for real world data as it's more robust against outlieres:
def pearson_skew(arr):
return 3*(arr.mean()-arr.median())/arr.var()
print("mean: %f, median: %f, skew: %f, Pearson skewness: %f" % (s_w.mean(), s_w.median(), scipy.stats.skew(s_w), pearson_skew(s_w)))
In [6]:
plt.bar(hist_n[1][:-1],hist_n[0],width=.1)
Out[6]:
In [7]:
CDF_w = Series(hist_w[0]).cumsum()
CDF_n = Series(hist_n[0]).cumsum()
plot(CDF_w,'r',CDF_n,'b')
Out[7]:
In [172]:
CDF_w.sum()
Out[172]:
In [14]:
nrm = scipy.stats.norm(100, 15)
In [15]:
plot(nrm.pdf(np.arange(-10,10,.1)),'r', nrm.cdf(np.arange(-10,10,.1)),'b')
Out[15]:
In [195]:
6*10**9*nrm.pdf(190)
Out[195]:
In [203]:
n = 4
std([max(random.normal(950,50,n)) for _ in range(365)])
Out[203]:
In [11]:
#plot(random.normal(950,50,365),'g',random.normal(1002,35,365),'r')
norm950 = scipy.stats.norm(950, 50)
norm1000 = scipy.stats.norm(1002, 35)
xs = np.arange(800,1200)
plot(xs,norm950.pdf(xs),'r', xs,norm1000.pdf(xs),'b')
Out[11]:
In [233]:
flipping_15_coins = scipy.stats.binom(15,.5)
flipping_15_coins.pmf(10) # propability that 10 flips yield head
Out[233]:
In [17]:
# creating a
import scipy.stats as stats
x = np.linspace(0, 1, 100)
plot(x,stats.beta.pdf(x,3,5),x,stats.beta.pdf(x,5,5),x,stats.beta.pdf(x,5,3),x,stats.beta.pdf(x,1,5),x,stats.beta.pdf(x,5,1))
Out[17]:
In [74]:
ser_twonormals = Series(random.normal(10,1,size=(10000)) + random.normal(50,1,size=(10000)))
ser_twonormals.hist(bins=100)
ser_twonormals.describe()
Out[74]:
In [75]:
ser_twonormals = Series(np.concatenate([random.normal(10,1,size=(10000)), random.normal(50,1,size=(10000))]))
ser_twonormals.hist(bins=100)
ser_twonormals.describe()
Out[75]:
In [91]:
cd /Users/florianhoppe/Google Drive/Wissen/Books/thinkstats/thinkstats_resources
In [112]:
import survey_using_panda
preg = Pregnancies()
preg.ReadRecords()
#len(preg.records)
Out[112]:
In [166]:
preg_columns = ['caseid','nbrnaliv','babysex','birthwgt_lb','birthwgt_oz','prglength','outcome','birthord','agepreg','finalwgt']
data = []
for record in preg.records:
data += [[getattr(record,field) for field in preg_columns]]
data = DataFrame(data,columns=preg_columns)
print(data[:3])
In [322]:
#data.birthord[data.birthord=='NA'] = np.NAN
data.birthord.value_counts()
Out[322]:
In [168]:
reported_birth = data.ix[data.birthord!='NA',:]
In [176]:
first_born = data.ix[data.birthord==1,:]
others = data.ix[(data.birthord!=1) & (data.birthord!='NA'),:]
In [327]:
field_to_check = 'finalwgt'#'prglength'
diff_of_means = getattr(first_born,field_to_check).mean()-getattr(others,field_to_check).mean()
# resampling
n = 1000
fb_samples_prglength = getattr(first_born,field_to_check).ix[first_born.index[rnd.sample(xrange(len(first_born)),n)]]
o_samples_prglength = getattr(others,field_to_check).ix[others.index[rnd.sample(xrange(len(others)),n)]]
diff_of_samples = Series(np.array(fb_samples_prglength.astype(float)) - np.array(o_samples_prglength.astype(float)))
diff_of_samples.hist(bins=100)
diff_of_samples.describe()
print("In %.0f %% cases the difference of the field %s is larger then the mean difference %f." % (float(100.0*(diff_of_samples>diff_of_means).sum())/n,field_to_check,diff_of_means))
In [333]:
first_born[['prglength','finalwgt']].describe()
Out[333]:
In [329]:
others[['prglength','finalwgt']].describe()
Out[329]:
In [331]:
first_born[['prglength','finalwgt']].describe()-others[['prglength','finalwgt']].describe()
Out[331]:
In [31]:
def tp():
ser = Series(np.random.poisson(10,1000))
return ser[ser<=10].count()
mean([tp() for _ in range(100)])
Out[31]:
In [3]:
ser = Series(np.random.poisson(10,1000))
In [35]:
ser.hist()
Out[35]:
In [37]:
# it's right-side skewed
scipy.stats.skew(ser)
Out[37]:
In [70]:
p_ser = []
for val in ser.unique():
p_of_val = ser[ser==val].count()/float(len(ser))
p_ser += [p_of_val]
plt.bar(list(ser.unique()),p_ser)
Out[70]:
In [63]:
ser.hist(bins=len(ser.unique()))
Out[63]:
In [43]:
rv = scipy.stats.poisson(10)
rv.cdf(10)
Out[43]:
In [92]:
lam = 10
n = 10000
ser = Series(np.random.poisson(lam,n))
ser.describe()
Out[92]:
In [51]:
p_below_mean = ser[ser<=ser.mean()].count().astype(float)/len(ser)
print("Percent of the samples are below the mean: %.0f %%" % (p_below_mean*100))
print("This equals (by definition) the value of the cummultative density function of the same poisson distribution: %.0f %%" % (100*scipy.stats.poisson(lam).cdf(lam)))
print("And the inverse of the mass function of that percentage is again the lambda: %.1f" % scipy.stats.poisson.ppf(p_below_mean,lam))
In [109]:
scipy.stats.poisson.ppf(scipy.stats.poisson(lam).cdf(lam),lam) == lam
Out[109]:
In [99]:
sorted_ser = np.sort(ser)
plot(arange(0,1,.01),scipy.stats.poisson.ppf(arange(0,1,.01),lam),'rx',arange(0,1,1.0/len(sorted_ser)),sorted_ser,'b')
Out[99]:
In [100]:
scipy.stats.poisson.ppf(.2,lam)
Out[100]:
In [102]:
# after sorting the index (id sorted_ser[i] or sorted_ser.ix[i]) still references the old order:
sorted_ser.values[.2 * len(sorted_ser)]
Out[102]:
In [142]:
dist_percentage = 0.97
lowb, highb = scipy.stats.poisson.interval(dist_percentage,lam)
print("To get at least %.0f %% of a poisson distribution with lambda %i the given event must be between %i and %i." % (dist_percentage,lam,lowb,highb))
print("For this interval the sample series contains %.1f %% events" % (ser[(ser>=lowb) & (ser<=highb)].count()*100.0/len(ser)))
In [70]:
# testing if the number of occurences matches expected ones:
alpha = 0.05
observed = np.array([126,174]) # absolut value of occurences of two events
expected = np.array([3.0/7, 4.0/7]) * observed.sum() # expected are here primarily given as percents/ration
X_squared, p_value = scipy.stats.chisquare(observed,expected)
print("The observed occurences have a propability of %.1f %%. The X^2 difference between observed and expected is %.2f." % (100*p_value, X_squared))
print(" => with significance niveau %.2f the null hypothesis (observed mean matches assumed one) is %s" % (alpha,"accepted" if p_value>alpha else "rejected"))
In [69]:
# t test: testing how a observed mean relates to a assumed (== true) mean ie. if it's equal, greater or less
# TODO add assumptions on distributions and ways to performe less and greater tests
true_mean = 10
alpha = 0.05
ser = random.normal(true_mean,1,size=(100000))
err,p_value = scipy.stats.ttest_1samp(ser,true_mean)
print("Given the sample series it's %.1f %% likely that the mean of the underlying distribution is %.0f. Error is %.4f" % (p_value*100,true_mean,err))
print(" => with significance niveau %.2f the null hypothesis (observed mean matches assumed one) is %s" % (alpha,"accepted" if p_value>alpha else "rejected"))
test_mean = 9.9
err,p_value = scipy.stats.ttest_1samp(ser,test_mean)
print("Given the sample series it's %.1f %% likely that the mean of the underlying distribution is %.0f." % (p_value*100,test_mean))
print(" => with significance niveau %.2f the null hypothesis (observed mean matches assumed one) is %s" % (alpha,"accepted" if p_value>alpha else "rejected"))
In [ ]: